# @shuaige : 陈世玉
# @name :spider2.py
# @time :2024/12/8 13:53
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def common_browser(chrome_path=None, headless=True):
    """Create and return a Chrome WebDriver instance.

    Args:
        chrome_path: Optional path to the Chrome executable. When given, it
            is used as-is, so a wrong path fails loudly at driver start-up.
            When omitted, the legacy hard-coded install location is used only
            if that file exists; otherwise no binary location is set and
            Selenium is left to find Chrome itself.
        headless: When true (the default, matching the original behavior),
            Chrome is started with the ``--headless`` argument.

    Returns:
        The ``webdriver.Chrome`` instance built from these options.
    """
    import os  # local import so the block does not depend on file-level changes

    # Legacy default: the Chrome install location on the original author's PC.
    legacy_path = r'C:\Users\陈世玉\AppData\Local\Google\Chrome\Application\chrome.exe'

    chrome_options = Options()
    if headless:
        chrome_options.add_argument('--headless')  # run without a visible window

    if chrome_path is not None:
        chrome_options.binary_location = chrome_path
    elif os.path.isfile(legacy_path):
        chrome_options.binary_location = legacy_path
    # Otherwise binary_location stays unset so the script can still start on
    # machines where the legacy path does not exist.

    browser = webdriver.Chrome(options=chrome_options)
    return browser
# Scrape the paginated ranking table and dump it to data2.txt, one school per
# line with the fields separated by single spaces.
browser = common_browser()
url = 'https://www.shanghairanking.cn/rankings/bcur/2024'
total_pages = 17  # number of result pages to scrape
# One XPath per output column, listed in the order the fields are written.
column_xpaths = (
    '//td[1]',                                # ranking
    "//td[2]//div/span[@class='name-cn']",    # school name (Chinese name only)
    '//td[3]',                                # region
    '//td[4]',                                # school type
    '//td[5]',                                # total score
    '//td[6]',                                # education level
)
try:
    browser.get(url)
    # 1. Open the output file once for the whole run.
    with open('data2.txt', 'w', encoding='utf-8') as f:
        for page in range(1, total_pages + 1):
            print(f"正在爬取第{page}页")
            # 2. Collect each column of the current page.
            # find_elements(By.XPATH, ...) replaces find_elements_by_xpath,
            # which no longer exists in Selenium 4.3+.
            columns = [browser.find_elements(By.XPATH, xpath) for xpath in column_xpaths]
            # zip stops at the shortest column, as the original loop did.
            for cells in zip(*columns):
                # Read each cell's text once and reuse it for file and console.
                line = ' '.join(cell.text for cell in cells)
                f.write(line + '\n')
                print(line)

            if page == total_pages:
                # Last page: there is nothing further to load, so do not wait
                # for or click the "next page" control.
                break

            # Scroll to the bottom so the pagination control is in view.
            browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)
            # Explicit wait until the "next page" button is clickable.
            next_button = WebDriverWait(browser, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".ant-pagination-next"))
            )
            next_button.click()
            time.sleep(2)  # give the next page's table time to render
finally:
    # Always shut the browser down, even if scraping fails part-way, so no
    # headless Chrome process is left running.
    browser.quit()
